### knitr defaults for every chunk: 6 x 4 figures written under 'Figs/',
### code echoed, messages and warnings suppressed in the rendered output
knitr::opts_chunk$set(
  fig.width  = 6,
  fig.height = 4,
  fig.path   = 'Figs/',
  echo       = TRUE,
  message    = FALSE,
  warning    = FALSE
)

library(tidyverse)
library(RColorBrewer)
library(stringr)

### Session-wide options
### Keep strings as character vectors instead of converting them to factors
options(stringsAsFactors = FALSE)

### generic theme for all plots
ggtheme_plot <- function(base_size = 9) {
  ### Build a minimal ggplot2 theme: no ticks, axis lines, panel background,
  ### border or minor grid; light grey major grid; legend on the right.
  ###   base_size: base text size; the plot title is scaled to 1.25x of it.
  ### Returns the object produced by theme(), to be added to a plot with `+`.
  nothing    <- element_blank()
  body_text  <- element_text(family = 'Helvetica', color = 'gray30', size = base_size)
  title_text <- element_text(size = rel(1.25), hjust = 0, face = 'bold')
  major_grid <- element_line(colour = 'grey90', size = .25)
  ### transparent legend keys (no outline, no fill)
  clear_key  <- element_rect(colour = NA, fill = NA)

  theme(
    axis.ticks       = nothing,
    text             = body_text,
    plot.title       = title_text,
    panel.background = nothing,
    legend.position  = 'right',
    panel.border     = nothing,
    panel.grid.minor = nothing,
    panel.grid.major = major_grid,
    legend.key       = clear_key,
    axis.line        = nothing
  )
}

1 Plot a few parameters against each other

1.1 Parameters vs temperature

### Regress every non-temperature parameter on the mean water temperature of
### the same sample date. The adjusted R^2 is printed for each parameter, and
### a scatterplot is drawn for those whose fit exceeds R_thresh.
z <- read_csv('data/water_qual_student.csv')

### minimum adjusted R^2 for a parameter to get a plot
R_thresh <- .1

### one row per sample date: mean of the non-missing temperature readings
temp_df <- z %>%
  filter(param_desc == 'WATER TEMPERATURE DEG C') %>%
  filter(!is.na(MeasureValue)) %>%
  group_by(SampleDate) %>%
  summarize(temp = mean(MeasureValue))

### every parameter except temperature itself
params <- z$param_desc %>% unique() %>%
  .[!str_detect(., 'WATER TEMPERATURE DEG C')]

for (param in params) {
  ### param <- params[1]
  tmp <- z %>%
    filter(param_desc %in% param) %>%
    inner_join(temp_df, by = 'SampleDate') %>%
    select(FIPS, place_name, EventId, SampleDate, param_desc, MeasureValue, temp, Unit) %>%
    distinct() %>%
    arrange(SampleDate)

  ### for larger samples, drop the top 0.5% of values as outliers
  if (nrow(tmp) > 500) {
    tmp <- tmp %>%
      filter(MeasureValue < quantile(MeasureValue, .995, na.rm = TRUE))
  }

  ### with fewer than 3 complete observations lm() fails or adjusted R^2 is
  ### NaN, which would abort the whole loop; report and move on instead
  n_obs <- sum(!is.na(tmp$MeasureValue) & !is.na(tmp$temp))
  if (n_obs < 3) {
    cat(sprintf('%s vs Temp:<br>  too few observations<hr>', param))
    next
  }

  mdl_R <- lm(MeasureValue ~ temp, data = tmp) %>%
    summary()

  if (mdl_R$adj.r.squared > R_thresh) {
    plot_units <- tmp$Unit[!is.na(tmp$Unit)][1]

    ### `tmp` carries no `Parameter` column (select() above does not keep it),
    ### so take the short name from `z` when that column exists and fall back
    ### to the full description otherwise
    short_names <- if ('Parameter' %in% names(z)) {
      z$Parameter[z$param_desc %in% param]
    } else {
      NULL
    }
    short_names <- short_names[!is.na(short_names)]
    plot_param_short <- if (length(short_names) > 0) short_names[1] else param

    param_plot <- ggplot(tmp, aes(x = temp, y = MeasureValue)) +
      geom_point(aes(color = place_name), alpha = .5) +
      stat_smooth(method = 'lm', color = 'grey20', size = .5) +
      labs(title = tools::toTitleCase(param),
           y = paste0(plot_param_short, ' (', plot_units, ')'),
           x = 'Temperature (°C)')

    print(param_plot)
  }
  ### note: the value reported here is the adjusted R^2
  cat(sprintf('%s vs Temp:<br>  R^2^ = %.4f<hr>', param, mdl_R$adj.r.squared))
}
WHOLE 5-DAY BIOCHEMICAL OXYGEN DEMAND MG/L vs Temp:
R2 = 0.0197
ACTIVE CHLOROPHYLL-A UG/L vs Temp:
R2 = 0.1386
DISSOLVED OXYGEN IN MG/L MG/L vs Temp:
R2 = 0.5506
HARDNESS AS CACO3 MG/L vs Temp:
R2 = 0.0063
AMMONIUM NITROGEN AS N (FILTERED SAMPLE) MG/L vs Temp:
R2 = 0.0053
NITRITE+NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Temp:
R2 = 0.0918
NITRITE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Temp:
R2 = 0.0233
NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Temp:
R2 = 0.1051
PH CORRECTED FOR TEMPERATURE (25 DEG C) SU vs Temp:
R2 = 0.0036
SALINITY UNITS IN PPT AND EQUAL TO PRACTICAL SALNITY UNITS (PSU) PPT vs Temp:
R2 = 0.0692
SECCHI DEPTH M vs Temp:
R2 = 0.0140
TOTAL ALKALINITY AS CACO3 MG/L vs Temp:
R2 = 0.0163
TOTAL DISSOLVED NITROGEN MG/L vs Temp:
R2 = 0.0962
TOTAL DISSOLVED PHOSPHORUS MG/L vs Temp:
R2 = 0.0035
TOTAL SUSPENDED SOLIDS MG/L vs Temp:
R2 = 0.0060
TURBIDITY; NEPHELOMETRIC METHOD NTU vs Temp:
R2 = 0.0096

1.2 Parameters vs oxygen

### Regress every non-oxygen parameter on the mean dissolved oxygen of the
### same sample date. The adjusted R^2 is printed for each parameter, and a
### scatterplot is drawn for those whose fit exceeds R_thresh.
z <- read_csv('data/water_qual_student.csv')

### minimum adjusted R^2 for a parameter to get a plot
R_thresh <- .1

### one row per sample date: mean of the non-missing dissolved-oxygen readings
o2_df <- z %>%
  filter(param_desc == 'DISSOLVED OXYGEN IN MG/L MG/L') %>%
  filter(!is.na(MeasureValue)) %>%
  group_by(SampleDate) %>%
  summarize(o2 = mean(MeasureValue))

### every parameter whose description does not mention dissolved oxygen
params <- z$param_desc %>% unique() %>%
  .[!str_detect(., 'DISSOLVED OXYGEN')]

for (param in params) {
  ### param <- params[1]
  tmp <- z %>%
    filter(param_desc %in% param) %>%
    inner_join(o2_df, by = 'SampleDate') %>%
    select(FIPS, place_name, EventId, SampleDate, param_desc, MeasureValue, o2, Unit) %>%
    distinct() %>%
    arrange(SampleDate)

  ### for larger samples, drop the top 0.5% of values as outliers
  if (nrow(tmp) > 500) {
    tmp <- tmp %>%
      filter(MeasureValue < quantile(MeasureValue, .995, na.rm = TRUE))
  }

  ### with fewer than 3 complete observations lm() fails or adjusted R^2 is
  ### NaN, which would abort the whole loop; report and move on instead
  n_obs <- sum(!is.na(tmp$MeasureValue) & !is.na(tmp$o2))
  if (n_obs < 3) {
    cat(sprintf('%s vs Dissolved Oxygen:<br>  too few observations<hr>', param))
    next
  }

  mdl_R <- lm(MeasureValue ~ o2, data = tmp) %>%
    summary()

  if (mdl_R$adj.r.squared > R_thresh) {
    plot_units <- tmp$Unit[!is.na(tmp$Unit)][1]

    ### `tmp` carries no `Parameter` column (select() above does not keep it),
    ### so take the short name from `z` when that column exists and fall back
    ### to the full description otherwise
    short_names <- if ('Parameter' %in% names(z)) {
      z$Parameter[z$param_desc %in% param]
    } else {
      NULL
    }
    short_names <- short_names[!is.na(short_names)]
    plot_param_short <- if (length(short_names) > 0) short_names[1] else param

    param_plot <- ggplot(tmp, aes(x = o2, y = MeasureValue)) +
      geom_point(aes(color = place_name), alpha = .5) +
      stat_smooth(method = 'lm', color = 'grey20', size = .5) +
      labs(title = tools::toTitleCase(param),
           y = paste0(plot_param_short, ' (', plot_units, ')'),
           x = 'Dissolved Oxygen (mg/L)')

    print(param_plot)
  }
  ### note: the value reported here is the adjusted R^2
  cat(sprintf('%s vs Dissolved Oxygen:<br>  R^2^ = %.4f<hr>', param, mdl_R$adj.r.squared))

}
WHOLE 5-DAY BIOCHEMICAL OXYGEN DEMAND MG/L vs Dissolved Oxygen:
R2 = 0.0243
ACTIVE CHLOROPHYLL-A UG/L vs Dissolved Oxygen:
R2 = 0.0699
HARDNESS AS CACO3 MG/L vs Dissolved Oxygen:
R2 = 0.0346
AMMONIUM NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Oxygen:
R2 = 0.0025
NITRITE+NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Oxygen:
R2 = 0.1068
NITRITE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Oxygen:
R2 = 0.0581
NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Oxygen:
R2 = 0.1074
PH CORRECTED FOR TEMPERATURE (25 DEG C) SU vs Dissolved Oxygen:
R2 = 0.0017
SALINITY UNITS IN PPT AND EQUAL TO PRACTICAL SALNITY UNITS (PSU) PPT vs Dissolved Oxygen:
R2 = 0.0614
SECCHI DEPTH M vs Dissolved Oxygen:
R2 = 0.0381
TOTAL ALKALINITY AS CACO3 MG/L vs Dissolved Oxygen:
R2 = 0.0035
TOTAL DISSOLVED NITROGEN MG/L vs Dissolved Oxygen:
R2 = 0.0973
TOTAL DISSOLVED PHOSPHORUS MG/L vs Dissolved Oxygen:
R2 = -0.0014
TOTAL SUSPENDED SOLIDS MG/L vs Dissolved Oxygen:
R2 = 0.0026
TURBIDITY; NEPHELOMETRIC METHOD NTU vs Dissolved Oxygen:
R2 = -0.0000
WATER TEMPERATURE DEG C vs Dissolved Oxygen:
R2 = 0.6943

1.3 Parameters vs nitrogen

### Regress every other parameter on the mean dissolved nitrogen of the same
### sample date. The adjusted R^2 is printed for each parameter, and a
### scatterplot is drawn for those whose fit exceeds R_thresh.
z <- read_csv('data/water_qual_student.csv')

### minimum adjusted R^2 for a parameter to get a plot
R_thresh <- .1

### one row per sample date: mean of the non-missing readings whose
### description contains 'DISSOLVED NITROGEN'
n2_df <- z %>%
  filter(str_detect(param_desc, 'DISSOLVED NITROGEN')) %>%
  filter(!is.na(MeasureValue)) %>%
  group_by(SampleDate) %>%
  summarize(n2 = mean(MeasureValue))

### every parameter whose description does not mention dissolved nitrogen
params <- z$param_desc %>% unique() %>%
  .[!str_detect(., 'DISSOLVED NITROGEN')]

for (param in params) {
  ### param <- params[1]
  tmp <- z %>%
    filter(param_desc %in% param) %>%
    inner_join(n2_df, by = 'SampleDate') %>%
    select(FIPS, place_name, EventId, SampleDate, param_desc, MeasureValue, n2, Unit) %>%
    distinct() %>%
    arrange(SampleDate)

  ### for larger samples, drop the top 0.5% of values as outliers
  if (nrow(tmp) > 500) {
    tmp <- tmp %>%
      filter(MeasureValue < quantile(MeasureValue, .995, na.rm = TRUE))
  }

  ### with fewer than 3 complete observations lm() fails or adjusted R^2 is
  ### NaN, which would abort the whole loop; report and move on instead
  n_obs <- sum(!is.na(tmp$MeasureValue) & !is.na(tmp$n2))
  if (n_obs < 3) {
    cat(sprintf('%s vs Dissolved Nitrogen:<br>  too few observations<hr>', param))
    next
  }

  mdl_R <- lm(MeasureValue ~ n2, data = tmp) %>%
    summary()

  if (mdl_R$adj.r.squared > R_thresh) {

    plot_units <- tmp$Unit[!is.na(tmp$Unit)][1]

    ### `tmp` carries no `Parameter` column (select() above does not keep it),
    ### so take the short name from `z` when that column exists and fall back
    ### to the full description otherwise
    short_names <- if ('Parameter' %in% names(z)) {
      z$Parameter[z$param_desc %in% param]
    } else {
      NULL
    }
    short_names <- short_names[!is.na(short_names)]
    plot_param_short <- if (length(short_names) > 0) short_names[1] else param

    param_plot <- ggplot(tmp, aes(x = n2, y = MeasureValue)) +
      geom_point(aes(color = place_name), alpha = .5) +
      stat_smooth(method = 'lm', color = 'grey20', size = .5) +
      labs(title = tools::toTitleCase(param),
           y = paste0(plot_param_short, ' (', plot_units, ')'),
           x = 'Dissolved Nitrogen (mg/L)')

    print(param_plot)
  }
  ### note: the value reported here is the adjusted R^2
  cat(sprintf('%s vs Dissolved Nitrogen:<br>  R^2^ = %.4f<hr>', param, mdl_R$adj.r.squared))

}
WHOLE 5-DAY BIOCHEMICAL OXYGEN DEMAND MG/L vs Dissolved Nitrogen:
R2 = 0.0070
ACTIVE CHLOROPHYLL-A UG/L vs Dissolved Nitrogen:
R2 = 0.0066
DISSOLVED OXYGEN IN MG/L MG/L vs Dissolved Nitrogen:
R2 = 0.1415
HARDNESS AS CACO3 MG/L vs Dissolved Nitrogen:
R2 = -0.0040
AMMONIUM NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Nitrogen:
R2 = 0.0767
NITRITE+NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Nitrogen:
R2 = 0.3626
NITRITE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Nitrogen:
R2 = 0.0367
NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Nitrogen:
R2 = 0.3513
PH CORRECTED FOR TEMPERATURE (25 DEG C) SU vs Dissolved Nitrogen:
R2 = 0.0198
SALINITY UNITS IN PPT AND EQUAL TO PRACTICAL SALNITY UNITS (PSU) PPT vs Dissolved Nitrogen:
R2 = 0.0590
SECCHI DEPTH M vs Dissolved Nitrogen:
R2 = 0.0143
TOTAL ALKALINITY AS CACO3 MG/L vs Dissolved Nitrogen:
R2 = 0.0254
TOTAL DISSOLVED PHOSPHORUS MG/L vs Dissolved Nitrogen:
R2 = 0.0177
TOTAL SUSPENDED SOLIDS MG/L vs Dissolved Nitrogen:
R2 = 0.0201
TURBIDITY; NEPHELOMETRIC METHOD NTU vs Dissolved Nitrogen:
R2 = 0.0268
WATER TEMPERATURE DEG C vs Dissolved Nitrogen:
R2 = 0.1974

1.4 Parameters vs phosphorus

### Regress every other parameter on the mean dissolved phosphorus of the
### same sample date. The adjusted R^2 is printed for each parameter, and a
### scatterplot is drawn for those whose fit exceeds R_thresh.
z <- read_csv('data/water_qual_student.csv')

### minimum adjusted R^2 for a parameter to get a plot
R_thresh <- .1

### one row per sample date: mean of the non-missing readings whose
### description contains 'DISSOLVED PHOSPHORUS', keeping only daily means
### below their 99th percentile
p_df <- z %>%
  filter(str_detect(param_desc, 'DISSOLVED PHOSPHORUS')) %>%
  filter(!is.na(MeasureValue)) %>%
  group_by(SampleDate) %>%
  summarize(p = mean(MeasureValue)) %>%
  filter(p < quantile(p, .99))

### every parameter whose description does not mention dissolved phosphorus
params <- z$param_desc %>% unique() %>%
  .[!str_detect(., 'DISSOLVED PHOSPHORUS')]

for (param in params) {
  ### param <- params[1]
  tmp <- z %>%
    filter(param_desc %in% param) %>%
    inner_join(p_df, by = 'SampleDate') %>%
    select(FIPS, place_name, EventId, SampleDate, param_desc, MeasureValue, p, Unit) %>%
    distinct() %>%
    arrange(SampleDate)

  ### for larger samples, drop the top 0.5% of values as outliers
  if (nrow(tmp) > 500) {
    tmp <- tmp %>%
      filter(MeasureValue < quantile(MeasureValue, .995, na.rm = TRUE))
  }

  ### with fewer than 3 complete observations lm() fails or adjusted R^2 is
  ### NaN, which would abort the whole loop; report and move on instead
  n_obs <- sum(!is.na(tmp$MeasureValue) & !is.na(tmp$p))
  if (n_obs < 3) {
    cat(sprintf('%s vs Dissolved Phosphorus:<br>  too few observations<hr>', param))
    next
  }

  mdl_R <- lm(MeasureValue ~ p, data = tmp) %>%
    summary()

  if (mdl_R$adj.r.squared > R_thresh) {

    plot_units <- tmp$Unit[!is.na(tmp$Unit)][1]

    ### `tmp` carries no `Parameter` column (select() above does not keep it),
    ### so take the short name from `z` when that column exists and fall back
    ### to the full description otherwise
    short_names <- if ('Parameter' %in% names(z)) {
      z$Parameter[z$param_desc %in% param]
    } else {
      NULL
    }
    short_names <- short_names[!is.na(short_names)]
    plot_param_short <- if (length(short_names) > 0) short_names[1] else param

    param_plot <- ggplot(tmp, aes(x = p, y = MeasureValue)) +
      geom_point(aes(color = place_name), alpha = .5) +
      stat_smooth(method = 'lm', color = 'grey20', size = .5) +
      labs(title = tools::toTitleCase(param),
           y = paste0(plot_param_short, ' (', plot_units, ')'),
           x = 'Dissolved Phosphorus (mg/L)')

    print(param_plot)
  }
  ### note: the value reported here is the adjusted R^2
  cat(sprintf('%s vs Dissolved Phosphorus:<br>  R^2^ = %.4f<hr>', param, mdl_R$adj.r.squared))

}
WHOLE 5-DAY BIOCHEMICAL OXYGEN DEMAND MG/L vs Dissolved Phosphorus:
R2 = 0.1360
ACTIVE CHLOROPHYLL-A UG/L vs Dissolved Phosphorus:
R2 = 0.0186
DISSOLVED OXYGEN IN MG/L MG/L vs Dissolved Phosphorus:
R2 = 0.0029
HARDNESS AS CACO3 MG/L vs Dissolved Phosphorus:
R2 = 0.0754
AMMONIUM NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Phosphorus:
R2 = 0.1170
NITRITE+NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Phosphorus:
R2 = 0.0209
NITRITE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Phosphorus:
R2 = 0.1398
NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Dissolved Phosphorus:
R2 = 0.0251
PH CORRECTED FOR TEMPERATURE (25 DEG C) SU vs Dissolved Phosphorus:
R2 = 0.0744
SALINITY UNITS IN PPT AND EQUAL TO PRACTICAL SALNITY UNITS (PSU) PPT vs Dissolved Phosphorus:
R2 = 0.0078
SECCHI DEPTH M vs Dissolved Phosphorus:
R2 = 0.0656
TOTAL ALKALINITY AS CACO3 MG/L vs Dissolved Phosphorus:
R2 = -0.0021
TOTAL DISSOLVED NITROGEN MG/L vs Dissolved Phosphorus:
R2 = 0.0256
TOTAL SUSPENDED SOLIDS MG/L vs Dissolved Phosphorus:
R2 = 0.1065
TURBIDITY; NEPHELOMETRIC METHOD NTU vs Dissolved Phosphorus:
R2 = 0.1935
WATER TEMPERATURE DEG C vs Dissolved Phosphorus:
R2 = 0.0136

1.5 Parameters vs turbidity

### Regress every non-turbidity parameter on the mean turbidity of the same
### sample date. The adjusted R^2 is printed for each parameter, and a
### scatterplot is drawn for those whose fit exceeds R_thresh.
z <- read_csv('data/water_qual_student.csv')

### minimum adjusted R^2 for a parameter to get a plot
R_thresh <- .1

### one row per sample date: mean of the non-missing readings whose
### description contains 'TURBIDITY', keeping only daily means below their
### 99th percentile
turb_df <- z %>%
  filter(str_detect(param_desc, 'TURBIDITY')) %>%
  filter(!is.na(MeasureValue)) %>%
  group_by(SampleDate) %>%
  summarize(turb = mean(MeasureValue)) %>%
  filter(turb < quantile(turb, .99))

### every parameter whose description does not mention turbidity
params <- z$param_desc %>% unique() %>%
  .[!str_detect(., 'TURBIDITY')]

for (param in params) {
  ### param <- params[1]
  tmp <- z %>%
    filter(param_desc %in% param) %>%
    inner_join(turb_df, by = 'SampleDate') %>%
    select(FIPS, place_name, EventId, SampleDate, param_desc, MeasureValue, turb, Unit) %>%
    distinct() %>%
    arrange(SampleDate)

  ### for larger samples, drop the top 0.5% of values as outliers
  if (nrow(tmp) > 500) {
    tmp <- tmp %>%
      filter(MeasureValue < quantile(MeasureValue, .995, na.rm = TRUE))
  }

  ### with fewer than 3 complete observations lm() fails or adjusted R^2 is
  ### NaN, which would abort the whole loop; report and move on instead
  n_obs <- sum(!is.na(tmp$MeasureValue) & !is.na(tmp$turb))
  if (n_obs < 3) {
    cat(sprintf('%s vs Turbidity:<br>  too few observations<hr>', param))
    next
  }

  mdl_R <- lm(MeasureValue ~ turb, data = tmp) %>%
    summary()

  if (mdl_R$adj.r.squared > R_thresh) {

    plot_units <- tmp$Unit[!is.na(tmp$Unit)][1]

    ### `tmp` carries no `Parameter` column (select() above does not keep it),
    ### so take the short name from `z` when that column exists and fall back
    ### to the full description otherwise
    short_names <- if ('Parameter' %in% names(z)) {
      z$Parameter[z$param_desc %in% param]
    } else {
      NULL
    }
    short_names <- short_names[!is.na(short_names)]
    plot_param_short <- if (length(short_names) > 0) short_names[1] else param

    param_plot <- ggplot(tmp, aes(x = turb, y = MeasureValue)) +
      geom_point(aes(color = place_name), alpha = .5) +
      stat_smooth(method = 'lm', color = 'grey20', size = .5) +
      labs(title = tools::toTitleCase(param),
           y = paste0(plot_param_short, ' (', plot_units, ')'),
           x = 'Turbidity (NTU)')

    print(param_plot)
  }
  ### note: the value reported here is the adjusted R^2
  cat(sprintf('%s vs Turbidity:<br>  R^2^ = %.4f<hr>', param, mdl_R$adj.r.squared))

}
WHOLE 5-DAY BIOCHEMICAL OXYGEN DEMAND MG/L vs Turbidity:
R2 = 0.0651
ACTIVE CHLOROPHYLL-A UG/L vs Turbidity:
R2 = 0.0152
DISSOLVED OXYGEN IN MG/L MG/L vs Turbidity:
R2 = -0.0002
HARDNESS AS CACO3 MG/L vs Turbidity:
R2 = 0.0725
AMMONIUM NITROGEN AS N (FILTERED SAMPLE) MG/L vs Turbidity:
R2 = 0.0074
NITRITE+NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Turbidity:
R2 = 0.0277
NITRITE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Turbidity:
R2 = 0.0030
NITRATE NITROGEN AS N (FILTERED SAMPLE) MG/L vs Turbidity:
R2 = 0.0208
PH CORRECTED FOR TEMPERATURE (25 DEG C) SU vs Turbidity:
R2 = 0.0385
SALINITY UNITS IN PPT AND EQUAL TO PRACTICAL SALNITY UNITS (PSU) PPT vs Turbidity:
R2 = 0.0046
SECCHI DEPTH M vs Turbidity:
R2 = 0.1311
TOTAL ALKALINITY AS CACO3 MG/L vs Turbidity:
R2 = 0.1068
TOTAL DISSOLVED NITROGEN MG/L vs Turbidity:
R2 = 0.0257
TOTAL DISSOLVED PHOSPHORUS MG/L vs Turbidity:
R2 = 0.0502
TOTAL SUSPENDED SOLIDS MG/L vs Turbidity:
R2 = 0.3017
WATER TEMPERATURE DEG C vs Turbidity:
R2 = 0.0062